# Kalmia analyze fruit size at end of season
# Using fruit sizes calculated from image segmentation in Python with opencv
#  Callin Switzer
# 20 October 2016
# 10 Feb 2017 Update: Conducted Statistical Modeling with LMER and GLMER

Read in data

ipak <- function(pkg){
     new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
     if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
     sapply(pkg, require, character.only = TRUE)
}

packages <- c("ggplot2", 'lme4', 'plyr', 'influence.ME', 'sjPlot', 'multcomp', 'lsmeans', 'Matrix')
ipak(packages)
##      ggplot2         lme4         plyr influence.ME       sjPlot 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##     multcomp      lsmeans       Matrix 
##         TRUE         TRUE         TRUE
theme_set(theme_classic())

kfrt <- read.csv("kalmiaFruitFinal.csv", stringsAsFactors = FALSE)
nrow(kfrt[kfrt$dia_mm == 20.0,]) # number of images taken
## [1] 92
# clean and process data
kfrt <- kfrt[kfrt$dia_mm != 20.0, ] # 20 is the reference object in segmented images
nrow(kfrt) # number of total fruits measured
## [1] 1305
# label treatmens and access numbers
kfrt$trt <- sapply(X = 1:nrow(kfrt), FUN = function(x) strsplit(kfrt$plantNum[x], split = "__")[[1]][2])
kfrt$accessNum <- sapply(X = 1:nrow(kfrt), FUN = function(x) strsplit(kfrt$plantNum[x], split = "__")[[1]][1])

kfrt$trt <- mapvalues(kfrt$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged", 
                                                "Unbagged & Outcrossed", "Unbagged_2"))

Add plants with 0 count to the dataset

# count number of fruits
counts = as.data.frame(table(kfrt$plantNum))
counts$trt = sapply(X = 1:nrow(counts), FUN = function(x) strsplit(as.character(counts$Var1[x]), split = "__")[[1]][2])

counts$trt <- mapvalues(counts$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged", 
                                                "Unbagged & Outcrossed", "Unbagged_2"))


counts$accessNum = sapply(X = 1:nrow(counts), FUN = function(x) strsplit(as.character(counts$Var1[x]), split = "__")[[1]][1])


# add in the trts and accession numbers that had counts of 0
# read in error-checked datasheet
kalNotes <- read.csv("KalmiaDailyDatasheets_ErrorChecked.csv")

# get the accession numbers I should have
accNumsHave <- unique(kalNotes$Plant.Number)
accNumsHave <- accNumsHave[!(accNumsHave %in% c("", "Plant Number"))]

#change formatting to match above
accNumsHave <- gsub("#", "", accNumsHave)
accNumsHave <- gsub("\ ", "_", accNumsHave)
accNumsHave <- gsub("\\-", "_", accNumsHave)
accNumsHave <- gsub("\\*", "_", accNumsHave)
accNumsHave <-toupper(accNumsHave)

shouldHave <- as.data.frame(t(sapply(accNumsHave, function(x) as.character(paste(x, c(1:4), sep = "__")))))
row.names(shouldHave) <- NULL
shouldHave1 <- c(as.character(shouldHave[, 1]), 
                 as.character(shouldHave[, 2]), 
                 as.character(shouldHave[, 3]), 
                 as.character(shouldHave[, 4]))

# find missing ones -- this is the ones that
# are in the daily datasheet that aren't in the fruit measurements
missingTrts <- setdiff(shouldHave1,unique(kfrt$plantNum[kfrt$trt != "5"]))

# this should be 0 -- the ones from the fruit measurements that aren't in the daily datasheet
setdiff(unique(kfrt$plantNum[kfrt$trt != "Unbagged_2"]), shouldHave1)
## character(0)
# here's the list of plants that had their bags/tags go missing during the experiment (meaning that
# I was unable to collect fruits)
# note: "677_66_MASS__1" was the only plant that was missing a tag during the final collection
# that wasn't missing in one of my previous checks. 
NAList <- c("1129_74_E__4", "1129_74_C__4", "39_60_A__3", "685_70_A__2", "677_66_MASS__1")

# note: this ignores trt#5 which is the same as #3
ZeroFruitPlants <- missingTrts[!(missingTrts %in% NAList)]

zfdf <- data.frame(Var1 = ZeroFruitPlants, 
                   Freq = 0, 
                   trt = sapply(X = 1:length(ZeroFruitPlants), 
                                FUN = function(x) strsplit(as.character(ZeroFruitPlants[x]), 
                                                           split = "__")[[1]][2]),
                   accessNum = sapply(X = 1:length(ZeroFruitPlants), 
                                            FUN = function(x) strsplit(as.character(ZeroFruitPlants[x]), 
                                                                       split = "__")[[1]][1])
                   )

zfdf$trt <- mapvalues(zfdf$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged", 
                                                "Unbagged & Outcrossed", "Unbagged_2"))
## The following `from` values were not present in `x`: 3, 4, 5
countFin <- rbind(counts, zfdf)


# final fruit counts for the fruits collected at the end of the experiment
countFin <- countFin[order(countFin$accessNum, countFin$trt, decreasing = FALSE), ]

# change labels from unbagged to conntrol
countFin$trt <- mapvalues(countFin$trt, c("Bagged", "Bagged & Selfed", "Unbagged", 
                                                "Unbagged & Outcrossed", "Unbagged_2")
                          , c("Bagged", "Bagged & Selfed", "Control", 
                                                "Unbagged & Outcrossed", "Control_2"))
# change levels, so that control is first
countFin$trt <- factor(countFin$trt, levels = c("Control","Control_2", "Bagged", "Bagged & Selfed", 
                                                "Unbagged & Outcrossed"))

Visualize counts of fruits

ggplot(countFin, aes(x = trt , y = Freq, fill = trt)) + 
     geom_boxplot() +
    # geom_violin()+
     
     labs(y = "Number of fruits", x = "Treatment") + 
     scale_fill_brewer(name = "Treatment", palette = "Set1")

saveDir <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/Media/"
ggsave(paste0(saveDir, "KalmiaFruitNumber.pdf"), width = 10, height = 8)

Visualize counts of fruits (ignore treatment #5)

ggplot(countFin[!(countFin$trt %in% 'Control_2'), ], aes(x = trt , y = Freq, fill = trt)) + 
     geom_boxplot() +
     labs(y = "Number of fruits", x = "Treatment") +
     theme(axis.text.x = element_text(angle = 35, hjust = 1), 
           legend.position = 'none') + 
     scale_fill_brewer(name = "Treatment", palette = "Set1")

ggsave(paste0(saveDir, "KalmiaFruitNumber_trt1_4Only.pdf"), width = 5, height = 4)

Use GLMM’s to find differences in number of fruits

First, summarize the dataset that I will be using

# I'm ignoring control#2, because it was originally intended for another experiment (analysis wasn't planned)
cf1 <- countFin[countFin$trt != "Control_2",]
nrow(countFin) # number of total counts, including Control2
## [1] 103
sum(cf1$Freq) # total number of fruits in the analysis of count
## [1] 1035
# Number of counts, excluding control_2
# this is different from the number of photos
# because I didn't take photos on 0-count plants
nrow(cf1) 
## [1] 83
data.frame(table(cf1$trt)) # number of plants in each treatment that we are analyzing
##                    Var1 Freq
## 1               Control   21
## 2             Control_2    0
## 3                Bagged   21
## 4       Bagged & Selfed   21
## 5 Unbagged & Outcrossed   20
data.frame(table(cf1$accessNum)) # shows number of counts per plant -- 4 means that we counted all four treatments
##           Var1 Freq
## 1    1129_74_A    4
## 2    1129_74_B    4
## 3    1129_74_C    3
## 4    1129_74_E    3
## 5    1129_74_F    4
## 6    12_2006_A    4
## 7  128_96_MASS    4
## 8  132_98_MASS    4
## 9     150_58_A    4
## 10    232_46_A    4
## 11     35_40_C    4
## 12     39_60_A    3
## 13    425_74_D    4
## 14    440_66_A    4
## 15     46_18_A    4
## 16 572_64_MASS    4
## 17    624_64_B    4
## 18    673_69_B    4
## 19 677_66_MASS    3
## 20    685_70_A    3
## 21    721_79_B    4
## 22 UNLABELED_1    4
# shows which treatments / plants are missing from analysis
# this is different from the plants that just had 0 counts
data.frame(table(interaction(cf1$accessNum , cf1$trt))) 
##                                  Var1 Freq
## 1                   1129_74_A.Control    1
## 2                   1129_74_B.Control    1
## 3                   1129_74_C.Control    1
## 4                   1129_74_E.Control    1
## 5                   1129_74_F.Control    1
## 6                   12_2006_A.Control    1
## 7                 128_96_MASS.Control    1
## 8                 132_98_MASS.Control    1
## 9                    150_58_A.Control    1
## 10                   232_46_A.Control    1
## 11                    35_40_C.Control    1
## 12                    39_60_A.Control    0
## 13                   425_74_D.Control    1
## 14                   440_66_A.Control    1
## 15                    46_18_A.Control    1
## 16                572_64_MASS.Control    1
## 17                   624_64_B.Control    1
## 18                   673_69_B.Control    1
## 19                677_66_MASS.Control    1
## 20                   685_70_A.Control    1
## 21                   721_79_B.Control    1
## 22                UNLABELED_1.Control    1
## 23                1129_74_A.Control_2    0
## 24                1129_74_B.Control_2    0
## 25                1129_74_C.Control_2    0
## 26                1129_74_E.Control_2    0
## 27                1129_74_F.Control_2    0
## 28                12_2006_A.Control_2    0
## 29              128_96_MASS.Control_2    0
## 30              132_98_MASS.Control_2    0
## 31                 150_58_A.Control_2    0
## 32                 232_46_A.Control_2    0
## 33                  35_40_C.Control_2    0
## 34                  39_60_A.Control_2    0
## 35                 425_74_D.Control_2    0
## 36                 440_66_A.Control_2    0
## 37                  46_18_A.Control_2    0
## 38              572_64_MASS.Control_2    0
## 39                 624_64_B.Control_2    0
## 40                 673_69_B.Control_2    0
## 41              677_66_MASS.Control_2    0
## 42                 685_70_A.Control_2    0
## 43                 721_79_B.Control_2    0
## 44              UNLABELED_1.Control_2    0
## 45                   1129_74_A.Bagged    1
## 46                   1129_74_B.Bagged    1
## 47                   1129_74_C.Bagged    1
## 48                   1129_74_E.Bagged    1
## 49                   1129_74_F.Bagged    1
## 50                   12_2006_A.Bagged    1
## 51                 128_96_MASS.Bagged    1
## 52                 132_98_MASS.Bagged    1
## 53                    150_58_A.Bagged    1
## 54                    232_46_A.Bagged    1
## 55                     35_40_C.Bagged    1
## 56                     39_60_A.Bagged    1
## 57                    425_74_D.Bagged    1
## 58                    440_66_A.Bagged    1
## 59                     46_18_A.Bagged    1
## 60                 572_64_MASS.Bagged    1
## 61                    624_64_B.Bagged    1
## 62                    673_69_B.Bagged    1
## 63                 677_66_MASS.Bagged    0
## 64                    685_70_A.Bagged    1
## 65                    721_79_B.Bagged    1
## 66                 UNLABELED_1.Bagged    1
## 67          1129_74_A.Bagged & Selfed    1
## 68          1129_74_B.Bagged & Selfed    1
## 69          1129_74_C.Bagged & Selfed    1
## 70          1129_74_E.Bagged & Selfed    1
## 71          1129_74_F.Bagged & Selfed    1
## 72          12_2006_A.Bagged & Selfed    1
## 73        128_96_MASS.Bagged & Selfed    1
## 74        132_98_MASS.Bagged & Selfed    1
## 75           150_58_A.Bagged & Selfed    1
## 76           232_46_A.Bagged & Selfed    1
## 77            35_40_C.Bagged & Selfed    1
## 78            39_60_A.Bagged & Selfed    1
## 79           425_74_D.Bagged & Selfed    1
## 80           440_66_A.Bagged & Selfed    1
## 81            46_18_A.Bagged & Selfed    1
## 82        572_64_MASS.Bagged & Selfed    1
## 83           624_64_B.Bagged & Selfed    1
## 84           673_69_B.Bagged & Selfed    1
## 85        677_66_MASS.Bagged & Selfed    1
## 86           685_70_A.Bagged & Selfed    0
## 87           721_79_B.Bagged & Selfed    1
## 88        UNLABELED_1.Bagged & Selfed    1
## 89    1129_74_A.Unbagged & Outcrossed    1
## 90    1129_74_B.Unbagged & Outcrossed    1
## 91    1129_74_C.Unbagged & Outcrossed    0
## 92    1129_74_E.Unbagged & Outcrossed    0
## 93    1129_74_F.Unbagged & Outcrossed    1
## 94    12_2006_A.Unbagged & Outcrossed    1
## 95  128_96_MASS.Unbagged & Outcrossed    1
## 96  132_98_MASS.Unbagged & Outcrossed    1
## 97     150_58_A.Unbagged & Outcrossed    1
## 98     232_46_A.Unbagged & Outcrossed    1
## 99      35_40_C.Unbagged & Outcrossed    1
## 100     39_60_A.Unbagged & Outcrossed    1
## 101    425_74_D.Unbagged & Outcrossed    1
## 102    440_66_A.Unbagged & Outcrossed    1
## 103     46_18_A.Unbagged & Outcrossed    1
## 104 572_64_MASS.Unbagged & Outcrossed    1
## 105    624_64_B.Unbagged & Outcrossed    1
## 106    673_69_B.Unbagged & Outcrossed    1
## 107 677_66_MASS.Unbagged & Outcrossed    1
## 108    685_70_A.Unbagged & Outcrossed    1
## 109    721_79_B.Unbagged & Outcrossed    1
## 110 UNLABELED_1.Unbagged & Outcrossed    1

Create a model

m1 <- glmer(Freq ~ trt + (1|accessNum), family = poisson, data = cf1)
summary(m1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: Freq ~ trt + (1 | accessNum)
##    Data: cf1
## 
##      AIC      BIC   logLik deviance df.resid 
##    699.1    711.2   -344.5    689.1       78 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8567 -1.2750 -0.3495  1.1718  5.2621 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  accessNum (Intercept) 0.4623   0.6799  
## Number of obs: 83, groups:  accessNum, 22
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               2.18834    0.16080  13.609   <2e-16 ***
## trtBagged                -0.90399    0.12302  -7.348    2e-13 ***
## trtBagged & Selfed        0.18797    0.08836   2.127   0.0334 *  
## trtUnbagged & Outcrossed  0.74707    0.08352   8.945   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) trtBgg trtB&S
## trtBagged   -0.219              
## trtBggd&Slf -0.301  0.395       
## trtUnbggd&O -0.334  0.418  0.583
# calculate LRT for trt
m2 <- update(m1, .~. - trt)
anova(m1, m2) # highly significant
## Data: cf1
## Models:
## m2: Freq ~ (1 | accessNum)
## m1: Freq ~ trt + (1 | accessNum)
##    Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## m2  2 961.54 966.38 -478.77   957.54                             
## m1  5 699.08 711.17 -344.54   689.08 268.46      3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model diagnostics

# diagnostics
# qq plot
qqnorm(resid(m1), main = "")
qqline(resid(m1)) # good, one possible outlier

# residual plot
plot(fitted(m1), resid(m1), xlab = "fitted", ylab = "residuals")
abline(0,0)

# possibly one or two outliers


# QQPlot for group-level effects
qqnorm(ranef(m1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(m1)$accessNum[[1]]) # looks good

infl <- influence(m1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model: 
sjp.lmer(m1, type = 'fe')

sjp.lmer(m1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

#check assumptions of model 
overdisp_fun <- function(model) {
  ## number of variance parameters in 
  ##   an n-by-n variance-covariance matrix
  vpars <- function(m) {
    nrow(m)*(nrow(m)+1)/2
  }
  model.df <- sum(sapply(VarCorr(model),vpars))+length(fixef(model))
  rdf <- nrow(model.frame(model))-model.df
  rp <- residuals(model,type="pearson")
  Pearson.chisq <- sum(rp^2)
  prat <- Pearson.chisq/rdf
  pval <- pchisq(Pearson.chisq, df=rdf, lower.tail=FALSE)
  c(chisq=Pearson.chisq,ratio=prat,rdf=rdf,p=pval)
}

overdisp_fun(m1) 
##        chisq        ratio          rdf            p 
## 2.613667e+02 3.350855e+00 7.800000e+01 1.230697e-21
# here's another way to check for overdispersion
residDev <- sum(residuals(m1, type = 'deviance')^2) # calculate residual deviance
# this ratio should be about 1 -- larger than 1 suggests overdispersion
residDev / df.residual(m1) 
## [1] 3.918246

Use negative binomial model, since the poisson model is overdispersed

m1.1 <- glmer.nb(Freq ~ trt + (1|accessNum), data = cf1)
summary(m1.1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(2.3758)  ( log )
## Formula: Freq ~ trt + (1 | accessNum)
##    Data: cf1
## 
##      AIC      BIC   logLik deviance df.resid 
##    568.2    582.7   -278.1    556.2       77 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.44802 -0.59594 -0.05952  0.47425  2.46419 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  accessNum (Intercept) 0.4418   0.6647  
## Number of obs: 83, groups:  accessNum, 22
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               2.23813    0.21591  10.366  < 2e-16 ***
## trtBagged                -1.01254    0.24831  -4.078 4.55e-05 ***
## trtBagged & Selfed        0.01286    0.23231   0.055  0.95585    
## trtUnbagged & Outcrossed  0.76456    0.22895   3.339  0.00084 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) trtBgg trtB&S
## trtBagged   -0.487              
## trtBggd&Slf -0.519  0.469       
## trtUnbggd&O -0.539  0.447  0.465
# get overall p-value for treatment
m2.1 <- update(m1.1, .~. - trt)
anova(m1.1, m2.1, test = 'LRT')
## Data: cf1
## Models:
## m2.1: Freq ~ (1 | accessNum)
## m1.1: Freq ~ trt + (1 | accessNum)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## m2.1  3 613.21 620.47 -303.61   607.21                             
## m1.1  6 568.15 582.67 -278.08   556.15 51.058      3  4.756e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Negative binomial model diagnostics

overdisp_fun(m1.1) 
##      chisq      ratio        rdf          p 
## 57.3160985  0.7348218 78.0000000  0.9621017
# here's another way to check for overdispersion
residDev <- sum(residuals(m1.1, type = 'deviance')^2) # calculate residual deviance
# this ratio should be about 1 -- larger than 1 suggests overdispersion
residDev / df.residual(m1.1) 
## [1] 1.103789
# diagnostics
# qq plot
qqnorm(resid(m1.1), main = "")
qqline(resid(m1.1)) # good, one possible outlier

# residual plot
plot(fitted(m1.1), resid(m1.1), xlab = "fitted", ylab = "residuals")
abline(0,0) # residuals look better for this model

# QQPlot for group-level effects
qqnorm(ranef(m1.1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(m1.1)$accessNum[[1]]) # looks good

infl <- influence(m1.1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model: 
sjp.lmer(m1.1, type = 'fe')

sjp.lmer(m1.1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

# get estimated dispersion parameter for NB Model
getME(m1.1, "glmer.nb.theta") # 2.37
## [1] 2.375765
# post-hoc pairwise comparisons with adjusted p-values
# from documentation: 
# test = adjusted() multiple test procedures as specified by the type argument
# to adjusted: "single-step" denotes adjusted p values as computed
# from the joint normal or t distribution of the z statistics (default),
summary(glht(m1.1, lsm(pairwise ~ trt)), test=adjusted("single-step")) # pairwise tests for fruit counts
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: glmer(formula = Freq ~ trt + (1 | accessNum), data = cf1, family = negative.binomial(theta = 2.37576455660569))
## 
## Linear Hypotheses:
##                                              Estimate Std. Error z value
## Control - Bagged == 0                         1.01254    0.24831   4.078
## Control - Bagged & Selfed == 0               -0.01286    0.23231  -0.055
## Control - Unbagged & Outcrossed == 0         -0.76456    0.22895  -3.339
## Bagged - Bagged & Selfed == 0                -1.02541    0.24792  -4.136
## Bagged - Unbagged & Outcrossed == 0          -1.77711    0.25143  -7.068
## Bagged & Selfed - Unbagged & Outcrossed == 0 -0.75170    0.23857  -3.151
##                                              Pr(>|z|)    
## Control - Bagged == 0                         < 0.001 ***
## Control - Bagged & Selfed == 0                0.99994    
## Control - Unbagged & Outcrossed == 0          0.00466 ** 
## Bagged - Bagged & Selfed == 0                 < 0.001 ***
## Bagged - Unbagged & Outcrossed == 0           < 0.001 ***
## Bagged & Selfed - Unbagged & Outcrossed == 0  0.00880 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Visualize annotated plot for fruit counts

count_pub <- within(countFin, {
     trt <- mapvalues(trt, from = c("Control","Control_2", "Bagged", "Bagged & Selfed", 
                                                "Unbagged & Outcrossed"), 
                      to = c("Control","Control_2", "Bagged", "Bagged\n & Selfed", 
                                                "Outcrossed"))
})

count_pub <- droplevels(count_pub)


ggplot(count_pub[!(count_pub$trt %in% 'Control_2'), ], aes(x = trt , y = Freq+ 0.1)) + 
     geom_boxplot(width = 0.2, fill = 'grey80') +
     labs(y = "Number of fruits", x = "Treatment") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') + 
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) + 10, label=c("a", "b", "a", "c"),
               color="black") + 
     scale_y_log10(breaks = c(1, 10, 100), limits = c(0.1, 100))

ggsave(paste0(saveDir, "KalmiaFruitNumber_logScale.pdf"), width = 3, height = 4)

Visualize CI’s for each of the groups

# refit m1.1 with treatments in alphabetical order (b/c order isn't preserved in model matrix)
cf1.1 <- within(cf1, {
     trt = as.factor(as.character(cf1$trt))
})

# refit model with different reference levels
m1.1 <- glmer.nb(Freq ~ trt + (1|accessNum), data = cf1.1)
pframe <- data.frame(trt = levels(droplevels(cf1.1$trt)))
pframe$Freq <- 0
pp <- predict(m1.1, newdata = pframe, re.form=NA, type = 'response') # re.form sets all random effects to 0


### Calculate CI's (using bootstrap, not accounting for random effects)
mm <- model.matrix(terms(m1.1), pframe)
predFun<-function(.) exp(mm%*%fixef(.) )
bb<-bootMer(m1.1,FUN=predFun,nsim=200) #do this 200 times
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00625995 (tol =
## 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.0073433 (tol =
## 0.001, component 1)
#as we did this 200 times the 95% CI will be bordered by the 5th and 195th value
bb_se<-apply(bb$t,2,function(x) quantile(x, probs = c(0.025, 0.975)))
pframe$blo<-bb_se[1,]
pframe$bhi<-bb_se[2,]
pframe$predMean <- pp

pframe$median = tapply(cf1.1$Freq, INDEX = cf1.1$trt, median)



pframe$trt <- mapvalues(pframe$trt, from = c("Control", "Bagged", "Bagged & Selfed", 
                                                "Unbagged & Outcrossed"),
                        to = c("Control", "Bagged", "Bagged\n & Selfed", 
                                                "Outcrossed"))
pframe$trt <- relevel(pframe$trt, ref = "Control")

#plot 95% confidence intervals
# "Mean and bootstrap CI based on fixed-effects uncertainty ONLY"
number_ticks <- function(n) {function(limits) pretty(limits, n)}
g0 <- ggplot(pframe, aes(x=trt, y=predMean))+
     geom_point()+ 
     labs(y = "Number of fruits", x = "Treatment") + 
     geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+
     scale_y_log10(limits =c(1,60), breaks = number_ticks(6)) + 
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') +
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) - 10, label=c("a", "b", "a", "c"),
               color="black") 
g0

ggsave(paste0(saveDir, "KalmiaFruitNumber_BSCI_logScale.pdf"), width = 3, height = 4)


## Bootstrap CI on linear scale -- not that great!
g1 <- ggplot(pframe, aes(x=trt, y=predMean))+
     geom_point()+ 
     labs(y = "Number of Fruits", x = "Treatment") + 
     geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+ 
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') +
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) - 30, label=c("a", "b", "a", "c"),
               color="black") 
g1

ggsave(paste0(saveDir, "KalmiaFruitNumber_BSCI_LinearScale.pdf"), width = 3, height = 4)


cp <- droplevels(count_pub[count_pub$trt != 'Control_2',])

# this might actually be the best way
ggplot(cp, aes(x = trt , y = Freq)) + 
     geom_boxplot(width = 0.2, fill = 'grey80') +
     labs(y = "Number of fruits", x = "Treatment") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') + 
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) + 1, label=c("a", "b", "a", "c"),
               color="black") 

ggsave(paste0(saveDir, "KalmiaFruitNumber_LinearScale.pdf"), width = 3, height = 4)

Visualize fruit size

# calculate fruit size by plant
sizeDF_mean <- as.data.frame(tapply(kfrt$dia_mm, INDEX = kfrt$plantNum, mean))
colnames(sizeDF_mean) = "meanFrtSz"
sizeDF_mean$trt = sapply(X = 1:length(sizeDF_mean$meanFrtSz), 
                          FUN = function(x) strsplit(as.character(row.names(sizeDF_mean)[x]), 
                                                     split = "__")[[1]][2])

sizeDF_mean$trt <- mapvalues(sizeDF_mean$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Control", 
                                                "Unbagged & Outcrossed", "Control_2"))

# reorder
sizeDF_mean$trt <- factor(sizeDF_mean$trt, levels = c("Control", "Control_2", "Bagged", "Bagged & Selfed",  
                                                "Unbagged & Outcrossed"))

sizeDF_mean$accessNum = sapply(X = 1:length(sizeDF_mean$meanFrtSz), 
                   FUN = function(x) strsplit(as.character(row.names(sizeDF_mean)[x]), 
                                              split = "__")[[1]][1])


ggplot(sizeDF_mean, aes(x = trt, y = meanFrtSz, fill = trt)) + 
     geom_boxplot(alpha = 0.5) + 
#      stat_summary(fun.y=mean, geom="line", aes(group = accessNum, color = accessNum))  + 
#      stat_summary(fun.y=mean, geom="point", aes(group = accessNum, color = accessNum)) + 
     geom_point(aes(color = trt))

ggplot(sizeDF_mean, aes(x = trt, y = meanFrtSz, fill = trt)) + 
     geom_boxplot() + 
     labs(y = "Mean Fruit Diameter (mm)", x = "Treatment") + 
     scale_fill_brewer(name = "Treatment", palette = "Set1")

ggsave(paste0(saveDir, "KalmiaFruitDiameter.pdf"), width = 10, height = 8)


# visualize , but exclude treatment 5
ggplot(sizeDF_mean[!(sizeDF_mean$trt %in% 'Control_2'), ], 
       aes(x = trt, y = meanFrtSz, fill = trt)) + 
     geom_boxplot() + 
     labs(y = "Mean Fruit Diameter (mm)", x = "Treatment") + 
     scale_fill_brewer(name = "Treatment", palette = "Set1") + 
     theme(axis.text.x = element_text(angle = 35, hjust = 1), 
          legend.position = 'none') 

ggsave(paste0(saveDir, "KalmiaFruitDiameter_trt14Only.pdf"), width = 5, height = 4)

Model Fruit Size with LMER

size1 <- within(kfrt, {
     trt <- mapvalues(trt, c("Bagged", "Bagged & Selfed", "Unbagged", 
                                                "Unbagged & Outcrossed", "Unbagged_2"), 
                      c("Bagged", "Bagged\n & Selfed", "Control", 
                                                "Outcrossed", "Control_2"))
})

size1 <- droplevels(size1[size1$trt != "Control_2",])

# get sample sizes for size dataset
nrow(size1) # total number of fruits in analysis for size
## [1] 1035
data.frame(table(size1$accessNum)) # total number of fruits per plant
##           Var1 Freq
## 1    1129_74_A   93
## 2    1129_74_B   36
## 3    1129_74_C   69
## 4    1129_74_E   43
## 5    1129_74_F   48
## 6    12_2006_A   22
## 7  128_96_MASS   64
## 8  132_98_MASS   17
## 9     150_58_A   39
## 10    232_46_A   13
## 11     35_40_C   70
## 12     39_60_A   37
## 13    425_74_D   94
## 14    440_66_A   53
## 15     46_18_A   19
## 16 572_64_MASS   31
## 17    624_64_B   43
## 18    673_69_B  124
## 19 677_66_MASS   69
## 20    685_70_A   23
## 21    721_79_B   10
## 22 UNLABELED_1   18
size1$trt <- relevel(as.factor(size1$trt), ref = "Control")
f1 <- lmer(dia_mm ~ trt + (1|accessNum), data = size1)
summary(f1) # final model for paper
## Linear mixed model fit by REML ['lmerMod']
## Formula: dia_mm ~ trt + (1 | accessNum)
##    Data: size1
## 
## REML criterion at convergence: 1859.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9129 -0.6393  0.0309  0.6617  3.4872 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  accessNum (Intercept) 0.1947   0.4412  
##  Residual              0.3264   0.5713  
## Number of obs: 1035, groups:  accessNum, 22
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)           4.35455    0.10232   42.56
## trtBagged            -0.51606    0.07280   -7.09
## trtBagged\n & Selfed  0.01786    0.05313    0.34
## trtOutcrossed         0.73112    0.04887   14.96
## 
## Correlation of Fixed Effects:
##             (Intr) trtBgg trtB&S
## trtBagged   -0.193              
## trtBggd&Slf -0.260  0.387       
## trtOutcrssd -0.306  0.403  0.535
# get p-value for trt
f2 <- update(f1, .~. - trt)
anova(f1, f2, "LRT")
## refitting model(s) with ML (instead of REML)
## Data: size1
## Models:
## f2: dia_mm ~ (1 | accessNum)
## f1: dia_mm ~ trt + (1 | accessNum)
##    Df    AIC    BIC   logLik deviance  Chisq Chi Df Pr(>Chisq)    
## f2  3 2233.5 2248.3 -1113.74   2227.5                             
## f1  6 1856.3 1886.0  -922.16   1844.3 383.17      3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fruit Size Diagnostics

# diagnostics
# qq plot
qqnorm(resid(f1), main = "")
qqline(resid(f1)) # good

# residual plot
plot(fitted(f1), residuals(f1, type = "deviance"), xlab = "fitted", ylab = "residuals")
abline(0,0) # residuals look better for this model

# QQPlot for group-level effects
qqnorm(ranef(f1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(f1)$accessNum[[1]]) # looks good

infl <- influence(f1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model: 
sjp.lmer(f1, type = 'fe')
## Computing p-values via Kenward-Roger approximation. Use `p.kr = FALSE` if computation takes too long.

# Best Linear Unbiased Predictors (BLUPs)
sjp.lmer(f1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

# post-hoc pairwise comparisons with adjusted p-values
# from documentation: 
# test = adjusted() multiple test procedures as specified by the type argument
# to adjusted: "single-step" denotes adjusted p values as computed
# from the joint normal or t distribution of the z statistics (default),
summary(glht(f1, lsm(pairwise ~ trt)), test=adjusted("single-step")) # pairwise tests for fruit counts
## Loading required namespace: lmerTest
## Note: df set to 1018
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lmer(formula = dia_mm ~ trt + (1 | accessNum), data = size1)
## 
## Linear Hypotheses:
##                                     Estimate Std. Error t value Pr(>|t|)
## Control - Bagged == 0                0.51606    0.07280   7.089   <1e-06
## Control - Bagged\n & Selfed == 0    -0.01786    0.05313  -0.336    0.987
## Control - Outcrossed == 0           -0.73112    0.04887 -14.960   <1e-06
## Bagged - Bagged\n & Selfed == 0     -0.53392    0.07163  -7.454   <1e-06
## Bagged - Outcrossed == 0            -1.24718    0.06945 -17.958   <1e-06
## Bagged\n & Selfed - Outcrossed == 0 -0.71326    0.04934 -14.456   <1e-06
##                                        
## Control - Bagged == 0               ***
## Control - Bagged\n & Selfed == 0       
## Control - Outcrossed == 0           ***
## Bagged - Bagged\n & Selfed == 0     ***
## Bagged - Outcrossed == 0            ***
## Bagged\n & Selfed - Outcrossed == 0 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Visualize Fruit Size with annotations

# refit m1.1 with treatments in alphabetical order (b/c order isn't preserved in model matrix)
sf1<- within(size1, {
     trt = as.factor(as.character(size1$trt))
})

# refit model with different reference levels
f1 <- lmer(dia_mm ~ trt + (1|accessNum), data = sf1)
pframe <- data.frame(trt = levels(droplevels(sf1$trt)))
pframe$dia_mm = 0
pp <- predict(f1, newdata = pframe, re.form=NA, type = 'response') # re.form sets all random effects to 0


### Calculate CI's (using bootstrap, not accounting for random effects)
mm <- model.matrix(terms(f1), pframe)
predFun<-function(.) mm%*%fixef(.)
bb<-bootMer(f1,FUN=predFun,nsim=200) #do this 200 times
# get quantiles from bootstrap sample
bb_se<-apply(bb$t,2,function(x) quantile(x, probs = c(0.025, 0.975)))
pframe$blo<-bb_se[1,]
pframe$bhi<-bb_se[2,]
pframe$predMean <- pp
pframe # print frame, in case reporting in a table
##                 trt dia_mm      blo      bhi predMean
## 1            Bagged      0 3.616601 4.030417 3.838494
## 2 Bagged\n & Selfed      0 4.167830 4.541833 4.372409
## 3           Control      0 4.157682 4.552634 4.354550
## 4        Outcrossed      0 4.901570 5.270815 5.085670
pframe$trt <- relevel(pframe$trt, ref = "Control")

#plot 95% confidence intervals
# "Mean and bootstrap CI based on fixed-effects uncertainty ONLY"
g2 <- ggplot(pframe, aes(x=trt, y=predMean))+
     geom_point()+ 
     labs(y = "Fruit dia. (mm)", x = "Treatment") + 
     geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') +
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(5,5,5,5) + 0.7, label=c("a", "b", "a", "c"),
               color="black") 
g2

ggsave(paste0(saveDir, "KalmiaFruitDia_BSCI.pdf"), width = 3, height = 4)

 ## visualize raw data (mean fruit size per plant)

sdf1 <- droplevels(within(sizeDF_mean[!(sizeDF_mean$trt %in% 'Control_2'), ], {
     trt <- mapvalues(trt, c("Bagged", "Bagged & Selfed", "Control", 
                                                "Unbagged & Outcrossed", "Control_2"), 
                      c("Bagged", "Bagged\n & Selfed", "Control", 
                                                "Outcrossed", "Control_2"))
}))

# visualize fruit size
ggplot(sdf1, aes(x = trt, y = meanFrtSz)) + 
     geom_boxplot(width = 0.2, fill = 'grey80') +
     labs(y = "Fruit dia. (mm)", x = "Treatment") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1), 
           legend.position = 'none') + 
     # annotate
     annotate(geom="text", x=c(1,2,3,4), y=c(5,5,5,5) + 2, label=c("a", "b", "a", "c"),
               color="black")

ggsave(paste0(saveDir, "KalmiaFruitDia.pdf"), width = 3, height = 4)


# compare two plots
g2 + geom_boxplot(data = sdf1, aes(x = trt, y = meanFrtSz), width = 0.2, alpha = 0)

Visualize data to compare fruit size with number of seeds

# read in data
sizeSeed <- read.csv("KalmiaFruitSizeAndSeeds.csv")

ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) + 
     geom_point() + 
     stat_smooth(method = 'lm', formula = y ~ exp(x), se = F) + 
     labs(x = 'Fruit Diameter (mm)', y = 'Num Seeds in 1 Carpel')

ggsave(paste0(saveDir, "KalmiaFruitSeeds.pdf"), width = 5, height = 4)


# on log scale
ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) + 
     geom_point() + 
     stat_smooth(method = 'lm', formula = y ~ x, se = F) + 
     scale_y_continuous(trans="log") + 
     labs(y = "log(e) number of seeds")

# GLM 
ss1 <- glm(NumSeeds ~ Dia..mm., family = poisson, data = sizeSeed)
summary(ss1)
## 
## Call:
## glm(formula = NumSeeds ~ Dia..mm., family = poisson, data = sizeSeed)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1484  -1.3123   0.0538   1.1869   3.4399  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22136    0.32470   0.682    0.495    
## Dia..mm.     0.63356    0.07019   9.027   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 156.083  on 18  degrees of freedom
## Residual deviance:  68.423  on 17  degrees of freedom
## AIC: 160.53
## 
## Number of Fisher Scoring iterations: 4
ss1$deviance / ss1$df.residual # overdispersed
## [1] 4.024875
# Neg Binomial Model 
ss2 <- glm.nb(NumSeeds ~ Dia..mm., data = sizeSeed)
summary(ss2) # final model for paper
## 
## Call:
## glm.nb(formula = NumSeeds ~ Dia..mm., data = sizeSeed, init.theta = 7.72630546, 
##     link = log)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.03651  -0.83952   0.02891   0.59012   1.60618  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.2739     0.5660   0.484    0.628    
## Dia..mm.      0.6215     0.1292   4.811  1.5e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(7.7263) family taken to be 1)
## 
##     Null deviance: 43.384  on 18  degrees of freedom
## Residual deviance: 18.964  on 17  degrees of freedom
## AIC: 135.72
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  7.73 
##           Std. Err.:  3.54 
## 
##  2 x log-likelihood:  -129.72
# get overall test stat for dia
ss3 <-update(ss2, .~. - Dia..mm.)
anova(ss2, ss3) # overall p-value for Dia..mm.
## Likelihood ratio tests of Negative Binomial Models
## 
## Response: NumSeeds
##      Model    theta Resid. df    2 x log-lik.   Test    df LR stat.
## 1        1 2.828473        18       -145.5067                      
## 2 Dia..mm. 7.726305        17       -129.7204 1 vs 2     1 15.78629
##        Pr(Chi)
## 1             
## 2 7.091441e-05
## visualize CI's for the two different models

newD <- data.frame(Dia..mm. = seq(2, 6, length.out = 100))
nbPreds = predict(ss2, newD, type = 'response', se.fit = TRUE)
nd1 <- data.frame(newD, nppred = nbPreds$fit, npse = nbPreds$se.fit)
predsPois <- predict(ss1, newD, type = 'response', se.fit = TRUE)


n2 <- data.frame(nd1, poisPred = predsPois$fit, poisse = predsPois$se.fit)
n2$npUpper <- with(n2,  nppred + 2 * npse)
n2$npLower <- with(n2, nppred - 2 * npse)
n2$pUpper <- with(n2,  poisPred + 2 * poisse)
n2$pLower <- with(n2, poisPred - 2 * poisse)

# compare NB errors vs. Pois errors
ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) + 
     geom_point() + 
     geom_line(data = n2, aes(x = Dia..mm., y = poisPred), color = 'blue') + 
     geom_line(data = n2, aes(x = Dia..mm., y = pUpper), color = 'blue', linetype = 2) + 
     geom_line(data = n2, aes(x = Dia..mm., y = pLower), color = 'blue', linetype = 2) + 
     labs(x = 'Fruit Diameter (mm)', y = 'Num Seeds in 1 Carpel') + 
     geom_line(data = n2, aes(x = Dia..mm., y = nppred), color = 'red') + 
     geom_line(data = n2, aes(x = Dia..mm., y = npUpper), color = 'red', linetype = 3) + 
     geom_line(data = n2, aes(x = Dia..mm., y = npLower), color = 'red', linetype = 3)

## model diagnostics for negative binomial model
plot(ss2, which = 4) #cook's distance

plot(y = residuals(ss2, type = 'deviance'), x = ss2$fitted.values)
abline(h = 0)

# should be close to 1
ss2$deviance / ss2$df.residual
## [1] 1.115512
# get sample size
nrow(sizeSeed) # total number of individual fruits
## [1] 19
sum(sizeSeed$NumSeeds) # total number of seeds counted
## [1] 382